{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "from pprint import pprint\n",
    "raw_df = pd.read_excel(\"测序数据统计-早筛组 20190211-20200207实验记录存档.20200427.xlsx\", sheet_name=\"测序信息登记\")\n",
    "capture_df = pd.read_excel(\"pooling.20200428.xlsx\", sheet_name=\"capture\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(795, 6)"
     },
     "metadata": {},
     "execution_count": 2
    }
   ],
   "source": [
    "# raw_df列切片去重，对日期进行处理\n",
    "raw_df_filt = raw_df.loc[:, [\"Pooling号\", \"送测日期\", \"上机日期\", \"下机日期\", \"机器号\", \"芯片号\"]]\n",
    "tmp = [ x if pd.isna(x) else str(int(x)) for x in raw_df_filt.loc[:,\"送测日期\"].to_list()]\n",
    "raw_df_filt[\"送测日期\"] = [ x if pd.isna(x) else x[:4]+\"/\"+x[4:6]+\"/\"+x[6:] for x in  tmp]\n",
    "raw_df_filt[\"上机日期\"] = raw_df_filt[\"上机日期\"].apply(lambda x: \"\" if x==\"无\" or pd.isna(x) else pd.to_datetime(x).strftime('%Y/%m/%d'))\n",
    "raw_df_filt[\"下机日期\"] = raw_df_filt[\"下机日期\"].apply(lambda x: \"\" if x==\"无\" or pd.isna(x) else pd.to_datetime(x).strftime('%Y/%m/%d'))\n",
    "raw_df_filt = raw_df_filt.fillna(method='ffill')\n",
    "raw_df_filt = raw_df_filt.drop_duplicates()\n",
    "raw_df_filt.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of     虚拟入库样本名1        送测日期        上机时间        下机时间                          机器号  \\\n0    IDTL130         NaN  2018/12/27  2018/12/29                R100400180038   \n1    IDTL131         NaN  2018/12/27  2018/12/29                R100400180038   \n2    IDTL142         NaN  2019/01/12  2019/01/14                R100400180015   \n3    IDTL145         NaN  2019/01/12  2019/01/14                 100400180007   \n4      LM001         NaN  2019/01/16  2019/01/18                R100400180015   \n..       ...         ...         ...         ...                          ...   \n206    LM285  2020/02/03  2020/02/04  2020/02/06  100400180014  R100400180013   \n207    LM286  2020/02/04  2020/02/05  2020/02/07                R100400180038   \n208    LM287  2020/02/04  2020/02/05  2020/02/07                R100400180120   \n209    LM287  2020/02/04  2020/02/05  2020/02/07                R100400180120   \n210    LM289  2020/02/04  2020/02/05  2020/02/07                R100400180025   \n\n                          芯片号                   捕获文库名  \n0      V300012489, V300012514  ssM-CpGiant-1-20181218  \n1      V300012489, V300012514  ssM-CpGiant-2-20181218  \n2    V300012566, V300012600,   ssM-CpGiant-1-20190104  \n3      V300012612, V300012618  ssM-CpGiant-4-20190104  \n4      V300012535, V300012554  ssM-CpGiant-2-20190109  \n..                        ...                     ...  \n206    V300032757, V300032653  ssM-CpGiant-4-20200131  \n207    V300044882, V300046512  ssM-CpGiant-5-20200131  \n208    V300044876, V300044921  ssM-CpGiant-6-20200131  \n209    V300044876, V300044921  ssM-CpGiant-7-20200131  \n210    V300044901, V300044737  ssM-CpGiant-8-20200131  \n\n[211 rows x 7 columns]>"
     },
     "metadata": {},
     "execution_count": 4
    }
   ],
   "source": [
    "#根据Pooling号合并raw_df_filt和capture_df\n",
    "raw_df_filt.loc[:, \"Pooling号\"] = raw_df_filt.loc[:, \"Pooling号\"].str.strip()\n",
    "capture_df.loc[:, \"虚拟入库样本名\"] = capture_df.loc[:, \"虚拟入库样本名\"].str.strip()\n",
    "merge_df = pd.merge(raw_df_filt, capture_df, left_on=\"Pooling号\", right_on=\"虚拟入库样本名\")\n",
    "merge_df.columns = ['虚拟入库样本名1', '送测日期', '上机时间', '下机时间', '机器号', '芯片号', '捕获文库名', '虚拟入库样本名2',\n",
    "       '杂交日期', '杂交探针', '杂交时间h', 'PostPCR循环数', 'PostPCR浓度ng/ul', '洗脱体积', '操作人',\n",
    "       '备注']\n",
    "merge_df_filt = merge_df.loc[:,['虚拟入库样本名1', '送测日期', '上机时间', '下机时间', '机器号', '芯片号', '捕获文库名']]\n",
    "# merge_df_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "#列“机器号”进行处理\n",
    "tmp = []\n",
    "for x in merge_df_filt.loc[:, \"机器号\"]:\n",
    "    x = re.sub(r'R', \"\", str(x).strip())\n",
    "    x = re.sub(r'(\\d+)', r'R\\1', str(x))\n",
    "    x = re.sub(r'\\s+', r', ', str(x))\n",
    "    tmp.append(x)\n",
    "merge_df_filt.loc[:, \"机器号\"] = tmp\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "count = {}\n",
    "tmp = []\n",
    "for idx in range(merge_df_filt.shape[0]):\n",
    "    k = re.sub('/', \"\", merge_df_filt.loc[idx, \"上机时间\"])\n",
    "    k = \"seq_\"+k\n",
    "    if k in count:\n",
    "        if merge_df_filt.loc[idx, \"机器号\"] not in count[k]:\n",
    "            count[k].append(merge_df_filt.loc[idx, \"机器号\"])\n",
    "\n",
    "    else:\n",
    "        count[k]=[merge_df_filt.loc[idx, \"机器号\"]]\n",
    "\n",
    "    \n",
    "    idx2 = count[k].index(merge_df_filt.loc[idx, \"机器号\"])\n",
    "    v = \"{}_{}\".format(k, idx2+1)\n",
    "    tmp.append(v)\n",
    "\n",
    "merge_df_filt.loc[:, \"测序编号\"] = tmp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "    虚拟入库样本名1        送测日期        上机时间        下机时间  \\\n0    IDTL130         NaN  2018/12/27  2018/12/29   \n1    IDTL131         NaN  2018/12/27  2018/12/29   \n2    IDTL142         NaN  2019/01/12  2019/01/14   \n3    IDTL145         NaN  2019/01/12  2019/01/14   \n4      LM001         NaN  2019/01/16  2019/01/18   \n..       ...         ...         ...         ...   \n206    LM285  2020/02/03  2020/02/04  2020/02/06   \n207    LM286  2020/02/04  2020/02/05  2020/02/07   \n208    LM287  2020/02/04  2020/02/05  2020/02/07   \n209    LM287  2020/02/04  2020/02/05  2020/02/07   \n210    LM289  2020/02/04  2020/02/05  2020/02/07   \n\n                              机器号                       芯片号  \\\n0                   R100400180038    V300012489, V300012514   \n1                   R100400180038    V300012489, V300012514   \n2                   R100400180015  V300012566, V300012600,    \n3                   R100400180007    V300012612, V300012618   \n4                   R100400180015    V300012535, V300012554   \n..                            ...                       ...   \n206  R100400180014, R100400180013    V300032757, V300032653   \n207                 R100400180038    V300044882, V300046512   \n208                 R100400180120    V300044876, V300044921   \n209                 R100400180120    V300044876, V300044921   \n210                 R100400180025    V300044901, V300044737   \n\n                      捕获文库名            测序编号  \n0    ssM-CpGiant-1-20181218  seq_20181227_1  \n1    ssM-CpGiant-2-20181218  seq_20181227_1  \n2    ssM-CpGiant-1-20190104  seq_20190112_1  \n3    ssM-CpGiant-4-20190104  seq_20190112_2  \n4    ssM-CpGiant-2-20190109  seq_20190116_1  \n..                      ...             ...  \n206  ssM-CpGiant-4-20200131  seq_20200204_2  \n207  ssM-CpGiant-5-20200131  seq_20200205_1  \n208  ssM-CpGiant-6-20200131  seq_20200205_2  \n209  ssM-CpGiant-7-20200131  seq_20200205_2  \n210  ssM-CpGiant-8-20200131  seq_20200205_3  \n\n[211 rows x 8 columns]",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>虚拟入库样本名1</th>\n      <th>送测日期</th>\n      <th>上机时间</th>\n      <th>下机时间</th>\n      <th>机器号</th>\n      <th>芯片号</th>\n      <th>捕获文库名</th>\n      <th>测序编号</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <td>0</td>\n      <td>IDTL130</td>\n      <td>NaN</td>\n      <td>2018/12/27</td>\n      <td>2018/12/29</td>\n      <td>R100400180038</td>\n      <td>V300012489, V300012514</td>\n      <td>ssM-CpGiant-1-20181218</td>\n      <td>seq_20181227_1</td>\n    </tr>\n    <tr>\n      <td>1</td>\n      <td>IDTL131</td>\n      <td>NaN</td>\n      <td>2018/12/27</td>\n      <td>2018/12/29</td>\n      <td>R100400180038</td>\n      <td>V300012489, V300012514</td>\n      <td>ssM-CpGiant-2-20181218</td>\n      <td>seq_20181227_1</td>\n    </tr>\n    <tr>\n      <td>2</td>\n      <td>IDTL142</td>\n      <td>NaN</td>\n      <td>2019/01/12</td>\n      <td>2019/01/14</td>\n      <td>R100400180015</td>\n      <td>V300012566, V300012600,</td>\n      <td>ssM-CpGiant-1-20190104</td>\n      <td>seq_20190112_1</td>\n    </tr>\n    <tr>\n      <td>3</td>\n      <td>IDTL145</td>\n      <td>NaN</td>\n      <td>2019/01/12</td>\n      <td>2019/01/14</td>\n      <td>R100400180007</td>\n      <td>V300012612, V300012618</td>\n      <td>ssM-CpGiant-4-20190104</td>\n      <td>seq_20190112_2</td>\n    </tr>\n    <tr>\n      <td>4</td>\n      <td>LM001</td>\n      <td>NaN</td>\n      <td>2019/01/16</td>\n      <td>2019/01/18</td>\n      <td>R100400180015</td>\n      <td>V300012535, V300012554</td>\n      <td>ssM-CpGiant-2-20190109</td>\n      <td>seq_20190116_1</td>\n    </tr>\n    <tr>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n      <td>...</td>\n    </tr>\n    <tr>\n      <td>206</td>\n      <td>LM285</td>\n      <td>2020/02/03</td>\n      <td>2020/02/04</td>\n      <td>2020/02/06</td>\n      <td>R100400180014, R100400180013</td>\n      <td>V300032757, V300032653</td>\n      <td>ssM-CpGiant-4-20200131</td>\n      <td>seq_20200204_2</td>\n    </tr>\n    <tr>\n      <td>207</td>\n      <td>LM286</td>\n      <td>2020/02/04</td>\n      <td>2020/02/05</td>\n      <td>2020/02/07</td>\n      <td>R100400180038</td>\n      <td>V300044882, V300046512</td>\n      <td>ssM-CpGiant-5-20200131</td>\n      <td>seq_20200205_1</td>\n    </tr>\n    <tr>\n      <td>208</td>\n      <td>LM287</td>\n      <td>2020/02/04</td>\n      <td>2020/02/05</td>\n      <td>2020/02/07</td>\n      <td>R100400180120</td>\n      <td>V300044876, V300044921</td>\n      <td>ssM-CpGiant-6-20200131</td>\n      <td>seq_20200205_2</td>\n    </tr>\n    <tr>\n      <td>209</td>\n      <td>LM287</td>\n      <td>2020/02/04</td>\n      <td>2020/02/05</td>\n      <td>2020/02/07</td>\n      <td>R100400180120</td>\n      <td>V300044876, V300044921</td>\n      <td>ssM-CpGiant-7-20200131</td>\n      <td>seq_20200205_2</td>\n    </tr>\n    <tr>\n      <td>210</td>\n      <td>LM289</td>\n      <td>2020/02/04</td>\n      <td>2020/02/05</td>\n      <td>2020/02/07</td>\n      <td>R100400180025</td>\n      <td>V300044901, V300044737</td>\n      <td>ssM-CpGiant-8-20200131</td>\n      <td>seq_20200205_3</td>\n    </tr>\n  </tbody>\n</table>\n<p>211 rows × 8 columns</p>\n</div>"
     },
     "metadata": {},
     "execution_count": 18
    }
   ],
   "source": [
    "merge_df_filt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp_dict = {}\n",
    "for col in merge_df_filt.columns:\n",
    "    tmp_dict[col] = []\n",
    "\n",
    "for idx in range(merge_df_filt.shape[0]):\n",
    "    try:\n",
    "        idx2 = tmp_dict[\"测序编号\"].index(merge_df_filt.loc[idx, \"测序编号\"])\n",
    "        tmp_dict[\"捕获文库名\"][idx2] = tmp_dict[\"捕获文库名\"][idx2] + \", \" + merge_df_filt.loc[idx, \"捕获文库名\"]\n",
    "        tmp_dict[\"虚拟入库样本名1\"][idx2] = tmp_dict[\"虚拟入库样本名1\"][idx2] + \", \" + merge_df_filt.loc[idx, \"虚拟入库样本名1\"]\n",
    "    except ValueError:\n",
    "        for col in merge_df_filt.columns:\n",
    "            tmp_dict[col].append(merge_df_filt.loc[idx, col])\n",
    "\n",
    "for idx in range(len(tmp_dict[\"虚拟入库样本名1\"])):\n",
    "    id_list = tmp_dict[\"虚拟入库样本名1\"][idx].split(\", \")\n",
    "    if len(id_list)>1:\n",
    "        tmp_dict[\"虚拟入库样本名1\"][idx]=\", \".join(list(set(id_list)))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.DataFrame(tmp_dict).to_excel(\"sequence.20200428.xlsx\", sheet_name=\"测序信息表\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "Timestamp('2018-12-27 11:32:00')"
     },
     "metadata": {},
     "execution_count": 32
    }
   ],
   "source": [
    "pd.to_datetime(merge_df_filt.loc[0,\"上机时间\"], format='%Y/%m/%d')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0      seq_20181227\n1      seq_20181227\n2      seq_20190112\n3      seq_20190112\n4      seq_20190116\n           ...     \n206    seq_20200204\n207    seq_20200205\n208    seq_20200205\n209    seq_20200205\n210    seq_20200205\nName: 上机时间, Length: 211, dtype: object"
     },
     "metadata": {},
     "execution_count": 35
    }
   ],
   "source": [
    "merge_df_filt[\"上机时间\"].apply(lambda x: pd.to_datetime(x).strftime('seq_%Y%m%d'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "Index(['Pooling号', '送测日期', '上机日期', '下机日期', '机器号', '芯片号', '文库名称', '虚拟入库样本名',\n       '杂交日期', '杂交探针', '杂交时间h', 'PostPCR循环数', 'PostPCR浓度ng/ul', '洗脱体积', '操作人',\n       '备注'],\n      dtype='object')"
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "merge_df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                     Pooling号        送测日期                 上机日期  \\\n0      BGISEQ-50018SZ0002233         NaN                    无   \n4      MGI2000-ZJC&SLL0510-1         NaN  2018-05-12 00:00:00   \n12    BGISEQ-50018SZ0002000          NaN                    无   \n15     BGISEQ-50018SZ0002452         NaN                    无   \n17     BGISEQ-50018SZ0002806         NaN                    无   \n...                      ...         ...                  ...   \n7021                   LS302  20200207.0  2020-02-05 15:48:00   \n7029                   LS303  20200207.0  2020-02-05 15:48:00   \n7037                   LS304  20200207.0  2020-02-05 15:48:00   \n7045                   LS305  20200207.0  2020-02-05 15:48:00   \n7053                   LS306  20200207.0  2020-02-05 15:48:00   \n\n                     下机日期            机器号                     芯片号  \n0                       无              无                       无  \n4     2018-05-16 00:00:00  MGISEQ200016               V100000929  \n12                      无              无                       无  \n15                      无              无                       无  \n17                      无              无                       无  \n...                   ...            ...                     ...  \n7021  2020-02-07 15:30:00  R100400180025  V300044901, V300044737  \n7029  2020-02-07 15:30:00  R100400180025  V300044901, V300044737  \n7037  2020-02-07 15:30:00  R100400180025  V300044901, V300044737  \n7045  2020-02-07 15:30:00  R100400180025  V300044901, V300044737  \n7053  2020-02-07 15:30:00  R100400180025  V300044901, V300044737  \n\n[787 rows x 6 columns]>"
     },
     "metadata": {},
     "execution_count": 8
    }
   ],
   "source": [
    "raw_df_filt.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of                        文库名称  虚拟入库样本名        杂交日期  \\\n0    ssM-CpGiant-1-20181218  IDTL130  2018/12/18   \n1    ssM-CpGiant-2-20181218  IDTL131  2018/12/18   \n2    ssM-CpGiant-1-20190104  IDTL142  2019/01/04   \n3    ssM-CpGiant-4-20190104  IDTL145  2019/01/04   \n4    ssM-CpGiant-2-20190109    LM001  2019/01/09   \n..                      ...      ...         ...   \n212  ssM-CpGiant-2-20200214    LM306  2020/02/14   \n213  ssM-CpGiant-3-20200214    LM307  2020/02/14   \n214  ssM-CpGiant-4-20200214    LM308  2020/02/14   \n215  ssM-CpGiant-5-20200214    LM309  2020/02/14   \n216  ssM-CpGiant-6-20200214    LM310  2020/02/14   \n\n                                  杂交探针  杂交时间h  PostPCR循环数 PostPCR浓度ng/ul  \\\n0    SeqCap Epi CpGiant Enrichment Kit   66.5          12           16.2   \n1    SeqCap Epi CpGiant Enrichment Kit   67.5          13           17.6   \n2    SeqCap Epi CpGiant Enrichment Kit   67.0          12            110   \n3    SeqCap Epi CpGiant Enrichment Kit   67.0          12            110   \n4    SeqCap Epi CpGiant Enrichment Kit   70.0          11           38.8   \n..                                 ...    ...         ...            ...   \n212  SeqCap Epi CpGiant Enrichment Kit   70.5           9             46   \n213  SeqCap Epi CpGiant Enrichment Kit   70.5           9           48.8   \n214  SeqCap Epi CpGiant Enrichment Kit   70.5           9           55.6   \n215  SeqCap Epi CpGiant Enrichment Kit   70.5           9             56   \n216  SeqCap Epi CpGiant Enrichment Kit   70.5           9           51.6   \n\n     洗脱体积  操作人  备注  \n0    40.0  NaN NaN  \n1    40.0  NaN NaN  \n2    40.0  NaN NaN  \n3    40.0  NaN NaN  \n4    40.0  NaN NaN  \n..    ...  ...  ..  \n212  40.0  LZL NaN  \n213  40.0  LZL NaN  \n214  40.0  LZL NaN  \n215  40.0  LZL NaN  \n216  40.0  LZL NaN  \n\n[217 rows x 10 columns]>"
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "capture_df.head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36764bit590e0209625c41dab6b137cee9f052d1",
   "display_name": "Python 3.6.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}